Overview

This document contains plots and statistical analyses for Homm et al. “Can Untargeted RNA-Seq of Urine Samples Diagnose UTIs in Children?”.

Loading packages

library(DESeq2)
library(ggplot2)
library(biomaRt)
library(EnhancedVolcano)
library(factoextra)
library(org.Hs.eg.db)
library(ensembldb)
library(tximport)
library(AnnotationHub)
library(clusterProfiler)
library(AnnotationDbi)
library(ggvenn)
library(ComplexHeatmap)
library(reshape2)
library(dplyr)
library(tidyr)
library(readxl)
library(pROC)
library(circlize)
library(ggpubr)
library(effsize)
library(mclust)
library(classInt)
library(patchwork)
library(grid)
library(eulerr)
library(devtools)
library(xCell)
library(circlize)

Load in necessary objects

load("load_before.RData")
#' Contains: 
#' - txi.data: uses tximport to convert Salmon gene expression data into R objects
#' - reads.meta.counts: patient metadata appended to reads-per-million kraken-derived information on all species present in these samples
#' - rpmdata: just reads-per-million kraken-derived information on all species present in these samples

0. Proportion of human and non-human reads across the patients

# Obtain reads data
reads <- reads.meta.counts$total.reads
human.reads <- reads.meta.counts$human.reads
data <- data.frame(
  patient = 1:length(human.reads),
  non_human_reads = reads - human.reads,
  human_reads = human.reads
)

# Compute proportion of human reads
data$human_read_proportion <- data$human_reads / reads

# Order patients by human read proportion
data <- data %>% arrange(human_read_proportion)

# Convert to long format for stacked bar plot
long_data <- data %>%
  pivot_longer(cols = c(human_reads, non_human_reads), names_to = "Read Type", values_to = "value")

# Ensure "human_reads" is the first level so it appears at the bottom
long_data$`Read Type` <- factor(long_data$`Read Type`, levels = c("human_reads", "non_human_reads"))

# Create the stacked bar plot
stacked_bar <- ggplot(long_data, aes(x = factor(patient, levels = data$patient), y = value, fill = `Read Type`)) +
  geom_col(position = "stack") +
  scale_fill_manual(values = c("human_reads" = "blue", "non_human_reads" = "red")) +
  labs(x = "214 Patients", y = "Number of Reads", fill = "Read Type") +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )


# Create a thin heatmap at the bottom
heatmap <- ggplot(data, aes(x = factor(patient, levels = data$patient), y = 1, fill = human_read_proportion)) +
  geom_tile() +
  scale_fill_gradient(low = "grey", high = "darkgreen", 
                      limits = c(0, 1),
                      breaks = c(0, 0.25, 0.5, 0.75, 1)) +
  labs(fill = "Human Read Proportion") +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 12), 
    legend.key.height = unit(0.5, "cm"),  
    legend.key.width = unit(1.5, "cm"),
    aspect.ratio = 0.05,
    legend.title.align = -1, 
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )

# Combine both plots using patchwork with adjusted heights
final_plot <- stacked_bar / heatmap + plot_layout(heights = c(4, 1))

final_plot

1. Salmon-based unsupervised host-response analysis

  1. Filter out patients with <= 5000 human genes, and genes with mean count < 1, then run DESeq2 with no condition (unsupervised)
# ID patients that express over 5000 human genes
human.gene.counts <- apply(txi.data$counts, 2, function(x) sum(x != 0))
human.gene.over.thres <- subset(human.gene.counts, human.gene.counts > 5000)
human.gene.over.thres <- human.gene.over.thres[na.omit(names(human.gene.over.thres))]

# Subset txi.data to only include these patients
txi.data$abundance <- subset(txi.data$abundance, select = c(names(human.gene.over.thres)))
txi.data$counts <- subset(txi.data$counts, select = c(names(human.gene.over.thres)))
txi.data$length <- subset(txi.data$length, select = c(names(human.gene.over.thres)))

# Calculate the mean counts of each gene
gene_means <- rowMeans(txi.data$counts)

# Filter out genes with mean counts less than 1
txi.data$counts <- txi.data$counts[gene_means > 1, ]
txi.data$abundance <- txi.data$abundance[gene_means > 1, ]
txi.data$length <- txi.data$length[gene_means > 1, ]

print(dim(txi.data$abundance))
## [1] 41617   214
col_data <- data.frame(row.names = colnames(txi.data$counts), condition = factor(rep(1, ncol(txi.data$counts))))
col_data <- cbind(col_data, reads.meta.counts)
dds.unsupervised <- DESeqDataSetFromTximport(txi.data, colData = col_data, design = ~1)
dds.unsupervised <- DESeq(dds.unsupervised)
  1. Stabilize the data and run PCA
normalized_counts <- counts(dds.unsupervised, normalized = TRUE)
vsd <- vst(dds.unsupervised, blind = TRUE)
pca_data <- plotPCA(vsd, returnData = TRUE)
pca_df <- data.frame(
  PC1 = pca_data$PC1,
  PC2 = pca_data$PC2,
  Sample = rownames(pca_data)
  )

TEST b2. again, but plot 3D pca and see what this shows

library(plotly)

# mimic plotPCA internals to get more PCs
ntop <- 500
rv <- rowVars(assay(vsd))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))

# make data frame for plotting
pca_df_3d <- data.frame(
  PC1 = pca$x[,1],
  PC2 = pca$x[,2],
  PC3 = pca$x[,3]
)

# interactive 3D plot
plot_ly(
  data = pca_df_3d,
  x = ~PC1, y = ~PC2, z = ~PC3,
  type = "scatter3d",
  mode = "markers"
)
summary(pca)$importance[2, 1]
## [1] 0.69163
  1. k-means clustering
# Run k-means clustering, k = 2
kmeans_result_pca <- kmeans(pca_df[ , c("PC1", "PC2")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df$cluster <- as.factor(kmeans_result_pca$cluster)

cluster_colors <- c("red", "blue")  # Colorblind-friendly red & blue

# Create the plot comparing PC1 and PC2
k_means_plot_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = factor(cluster))) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(aes(fill = factor(cluster)),   geom = "polygon",
  alpha = 0.3,
  color = "black",    # outline to make visible
  level = 0.95,
  show.legend = TRUE) +
  labs(
    title = "PCA Plot with K-means Clusters (k = 2)",
    x = "PC1",
    y = "PC2",
    color = "Cluster",
    fill = "Cluster"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold"),
    legend.position = "right",
    legend.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  ) +
  scale_color_manual(values = cluster_colors) +
  scale_fill_manual(values = cluster_colors)

# Display the plot
k_means_plot_pca

TEST c2: k-means clustering on 3 dimensions

# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df_3d$cluster <- as.factor(kmeans_result_pca_3d$cluster)

# Set color palette
cluster_colours <- c("1" = "red", "2" = "blue")

# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
  data = pca_df_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~cluster,
  colors = cluster_colours,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with K-means Clusters (k = 2)", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
k_means_plot_3d
  1. DESeq2 comparing both groups
# Re-name the clusters to 0/1
patients.over.thres <- names(human.gene.over.thres)
k_groups <- kmeans_result_pca$cluster
k_groups <- ifelse(k_groups == 2, 0, 1)
names(k_groups) <- patients.over.thres

# Create DESeq metadata
k_groups_metadata <- data.frame(group = k_groups, 
                                group_word = sapply(k_groups, function(x) if (x == 1) {return("uti")} else {return("no_uti")}))

# Run DESeq on cluster 1 vs. cluster 2, using the original txi.data
dds.kmeans <- DESeqDataSetFromTximport(txi.data, colData = k_groups_metadata, design = ~group_word)

# Brief moment to run vst again
vst.for.heatmap <- varianceStabilizingTransformation(dds.kmeans, blind = TRUE)

# Run DESeq2
dds.kmeans <- DESeq(dds.kmeans)

# Isolate the volcano info
res.human <- results(dds.kmeans, contrast = c("group_word", "uti", "no_uti"))

# Map ensembl ids to their actual gene names
res.df.human <- as.data.frame(res.human)
res.df.human$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res.df.human), keytype = "ENSEMBL", column = "SYMBOL")

genes_to_label <- c("LIMK2", "CSF3R", "BCL6", "NFKB1", "SELL", "JAK3")

# Create the volcano
volc <- EnhancedVolcano(res.df.human, 
                        x = "log2FoldChange", 
                        y = "padj", 
                        lab = res.df.human$symbol, 
                        selectLab = genes_to_label,  
                        drawConnectors = TRUE, 
                        pointSize = 3.0, 
                        title = "DESeq2 with K-means Groups", 
                        subtitle = "",
                        FCcutoff = 1, 
                        pCutoff = 0.05)

volc

  1. Perform biological process enrichment on these up/down-regulated genes
enrichment <- function(res.df, padj_thres, log2fc_thres, up.or.down, save = FALSE) {

  if (up.or.down == "up") {
    sig.genes <- rownames(res.df[res.df$log2FoldChange > log2fc_thres & res.df$padj < padj_thres, ])
    sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
    title = "Upregulated Pathways"
  } else {
    sig.genes <- rownames(res.df[res.df$log2FoldChange < -log2fc_thres & res.df$padj < padj_thres, ])
    sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
    title = "Downregulated Pathways"
  }
  
  # Run GO analysis
  GO_results <- enrichGO(gene = sig.genes, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")
  GO_results_df <- as.data.frame(GO_results)
  
  # Print p-values for debugging
  print("Original p.adjust values:")
  print(head(GO_results_df$p.adjust))
  
  # Create plot using the enrichResult object (GO_results)
  fit.up <- barplot(GO_results, showCategory = 10) +
    xlab("Number of Genes") + 
    ggtitle(title) + 
    theme(
      plot.title = element_text(size = 15, face = "bold", hjust = 0.5), 
      axis.text.y = element_text(size = 12)
    ) +
    scale_fill_gradient(
      low = "#e73512", 
      high = "#ee794d", 
      name = "Adjusted p-value",
      trans = "log10",  # Log transform for small p-values
      labels = scales::scientific_format()  # Force scientific notation
    )
  
  if (save) {
    ggsave(
      filename = paste0(up.or.down, ".pdf"), 
      plot = fit.up, 
      device = "pdf", 
      width = 12,
      height = 10
    )
  }
  
  print(paste("Number of significant genes:", length(sig.genes)))
  return(fit.up)
}

# Run enrichment function on up/down-regulated genes
upreg.plot <- enrichment(res.df.human, 0.05, 1, "up", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.459751e-40 7.144569e-40 2.801856e-39 3.411580e-39 4.370757e-39
## [6] 8.615696e-39
## [1] "Number of significant genes: 3301"
downreg.plot <- enrichment(res.df.human, 0.05, 1, "down", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.938446e-33 8.969342e-32 4.342786e-28 5.029298e-24 3.873654e-23
## [6] 7.331487e-23
## [1] "Number of significant genes: 6030"

2. Kraken2 and Bracken-based taxonomic classification

  1. Compute pathogen abundance scores
# Remove human reads
df_no_human <- rpmdata[, !colnames(rpmdata) %in% "Homo sapiens"]
total_rpm <- rowSums(df_no_human)

# Normalize dataframe back to RPM
df_renormalized <- sweep(df_no_human, 1, total_rpm, "/") * 1e6
df_renormalized[is.nan(df_renormalized)] <- 0
df_renormalized <- df_renormalized[patients.over.thres , ]

# Seven common UTI-associated pathogens
seven.pathogens <- c("Escherichia coli", "Klebsiella pneumoniae", "Enterococcus faecalis", "Proteus mirabilis", 
                     "Pseudomonas aeruginosa", "Staphylococcus saprophyticus", "Streptococcus agalactiae") 

# Subset to only include reads for these seven pathogens
df_subset <- df_renormalized[ , colnames(df_renormalized) %in% seven.pathogens]
rpm_sums <- rowSums(df_subset, na.rm = TRUE)
rpm_sums <- rpm_sums[patients.over.thres]

# Logs the final summation of normalized pathogen abundance
rpm_sums <- sapply(rpm_sums, function(x) max(0, log10(x)))

# Run jenks classification algorithm to find optimal threshold
pathogen_abundance = rpm_sums
breaks <- classIntervals(pathogen_abundance, n=2, style="jenks")
threshold <- max(breaks$brks[-length(breaks$brks)])
print(paste0("Threshold = ", threshold))
## [1] "Threshold = 4.0107193789017"
# View dist of these abundance scores
hist(rpm_sums, breaks = 12, main = title("Logged RPM, Human Reads Excluded"))
abline(v = threshold, col = "red", lty = 2, lwd = 2)

# View pathogen abundance scores across random y-axis

set.seed(1234)

# Create a data frame with random y-values
abundance.df <- data.frame(
  x = rpm_sums,
  y = runif(length(rpm_sums), min = 0, max = 1), 
  colour = ifelse(rpm_sums > threshold, "Positive", "Negative")
)

rpm.jitter <- ggplot(abundance.df, aes(x = x, y = y, color = colour)) +
  geom_jitter(height = 0, width = 0.1, alpha = 0.6, size = 2.5) +
  geom_vline(xintercept = threshold, color = "black", linetype = "dashed", linewidth = 1) +
  labs(
    title = "RPM Sum Distribution",
    x = expression(log[10] ~ "(Pathogen Abundance Score, RPM)"),
    y = "Random Y-Value",
    color = "Cluster"
  ) +
  scale_color_manual(values = c("Positive" = "red", "Negative" = "blue")) +
  theme_classic(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    axis.title = element_text(),  
    legend.position = "right",
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1) 
    
  )

print(rpm.jitter)

  1. Bubble plot observing pathogen abundance across the patients
heatmap.df <- t(df_renormalized[patients.over.thres, seven.pathogens])
heatmap.df <- apply(heatmap.df, c(1, 2), function(x) max(log10(x), 0))
rownames(heatmap.df) <- sapply(rownames(heatmap.df), function(x) gsub(" ", "\n", x))

# Convert the heatmap.df matrix to a long-format data frame
bubble_data <- melt(heatmap.df)
colnames(bubble_data) <- c("Row", "Column", "Value")

# Get the E coli row values
e_coli_values <- heatmap.df["Escherichia\ncoli", ]

# Reorder the columns based on the E coli values (descending order)
bubble_data$Column <- factor(bubble_data$Column, levels = names(sort(e_coli_values, decreasing = TRUE)))

bubble_plot <- ggplot(bubble_data, aes(x = Column, y = Row, size = Value, fill = Value)) +
  geom_point(shape = 21, color = "black") +
  scale_size_continuous(range = c(1, 10)) +
  scale_fill_gradient(low = "white", high = "red") +  
  labs(
    x = "214 Patients",  
    y = "Pathogen",
    size = "log10(RPM)",
    fill = "log10(RPM)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(), 
    axis.text.y = element_text(size = 12, face = "italic"),
    axis.title.x = element_text(size = 12), 
    axis.title.y = element_text(size = 12), 
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1) 
  )

print(bubble_plot)

  1. ROC and AUC: agreement between Kraken-derived RPM scores and clinical diagnostics
log.reg.df <- data.frame(df_subset)

# Ecoli

log.reg.df$ecoli <- ifelse(reads.meta.counts$ecoli == 1, 1, 0)  
log.reg.df$ecoli[is.na(log.reg.df$ecoli)] <- 0  

ecoli <- glm(ecoli ~ Escherichia.coli, data = log.reg.df, family = binomial)  
ecoli_probs <- predict(ecoli, type = "response")
auc_ecoli <- roc(log.reg.df$ecoli, ecoli_probs)
print(auc_ecoli)
## 
## Call:
## roc.default(response = log.reg.df$ecoli, predictor = ecoli_probs)
## 
## Data: ecoli_probs in 106 controls (log.reg.df$ecoli 0) < 108 cases (log.reg.df$ecoli 1).
## Area under the curve: 0.9895
plot(auc_ecoli, col = "blue", main = "ROC Curve for E. coli")

# Kleb

log.reg.df$kleb <- ifelse(reads.meta.counts$klebsiella == 1, 1, 0)  
log.reg.df$kleb[is.na(log.reg.df$kleb)] <- 0  

kleb <- glm(kleb ~ Klebsiella.pneumoniae, data = log.reg.df, family = binomial)  
kleb_probs <- predict(kleb, type = "response")
auc_kleb <- roc(log.reg.df$kleb, kleb_probs)
print(auc_kleb)
## 
## Call:
## roc.default(response = log.reg.df$kleb, predictor = kleb_probs)
## 
## Data: kleb_probs in 211 controls (log.reg.df$kleb 0) < 3 cases (log.reg.df$kleb 1).
## Area under the curve: 0.7196
plot(auc_kleb, col = "blue", main = "ROC Curve for K. pneumoniae")

# EC

log.reg.df$ec <- ifelse(reads.meta.counts$enterococcus == 1, 1, 0)  
log.reg.df$ec[is.na(log.reg.df$ec)] <- 0  

ec <- glm(ec ~ Enterococcus.faecalis, data = log.reg.df, family = binomial)  
ec_probs <- predict(ec, type = "response")
auc_ec <- roc(log.reg.df$ec, ec_probs)
print(auc_ec)
## 
## Call:
## roc.default(response = log.reg.df$ec, predictor = ec_probs)
## 
## Data: ec_probs in 209 controls (log.reg.df$ec 0) < 5 cases (log.reg.df$ec 1).
## Area under the curve: 0.9943
plot(auc_ec, col = "blue", main = "ROC Curve for E. faecalis")

# Proteus

log.reg.df$prot <- ifelse(reads.meta.counts$proteus == 1, 1, 0)  
log.reg.df$prot[is.na(log.reg.df$prot)] <- 0  

prot <- glm(prot ~ Proteus.mirabilis, data = log.reg.df, family = binomial)  
prot_probs <- predict(prot, type = "response")
auc_prot <- roc(log.reg.df$prot, prot_probs)
print(auc_prot)
## 
## Call:
## roc.default(response = log.reg.df$prot, predictor = prot_probs)
## 
## Data: prot_probs in 209 controls (log.reg.df$prot 0) < 5 cases (log.reg.df$prot 1).
## Area under the curve: 1
plot(auc_prot, col = "blue", main = "ROC Curve for P. mirabilis")

# P. aeruginosa

log.reg.df$pa <- ifelse(reads.meta.counts$mainpathogen == "Pseudomonas Aeruginosa", 1, 0)  
log.reg.df$pa[is.na(log.reg.df$pa)] <- 0  

pa <- glm(pa ~ Pseudomonas.aeruginosa, data = log.reg.df, family = binomial)  
pa_probs <- predict(pa, type = "response")
auc_pa <- roc(log.reg.df$pa, pa_probs)
print(auc_pa)
## 
## Call:
## roc.default(response = log.reg.df$pa, predictor = pa_probs)
## 
## Data: pa_probs in 213 controls (log.reg.df$pa 0) < 1 cases (log.reg.df$pa 1).
## Area under the curve: 1
plot(auc_pa, col = "blue", main = "ROC Curve for P. aeruginosa")

3. How well does host-response and pathogen abundance overlap?

  1. Gene expression heatmap overlayed with pathogen abundance score
# Create a patient order based on k-means group: positive (1) then negative (0)
ordered_k_groups <- k_groups[order(k_groups, decreasing = TRUE)]
ordered_k_groups_name <- names(ordered_k_groups)

# Find up and down genes
sig.genes.up <- rownames(res.df.human[res.df.human$log2FoldChange > 2 & res.df.human$padj < 1e-25 , ])
sig.genes.up <- sig.genes.up[!is.na(sig.genes.up) & !grepl("^NA", sig.genes.up)]

sig.genes.down <- rownames(res.df.human[res.df.human$log2FoldChange < -2 & res.df.human$padj < 1e-25 , ])
sig.genes.down <- sig.genes.down[!is.na(sig.genes.down) & !grepl("^NA", sig.genes.down)]

# Bind these genes together
sig.genes.heatmap <- c(sig.genes.up, sig.genes.down)
sig.genes.heatmap <- sig.genes.heatmap[!is.na(sig.genes.heatmap) & !grepl("^NA", sig.genes.heatmap)]

# Format the z-score-based heatmap fill
temp = t(scale(t(assay(vst.for.heatmap[,])))) 
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0
temp <- temp[sig.genes.heatmap , ]

# Add first col annotation- the k-means cluster
col_anno <- HeatmapAnnotation(k_means_group = ordered_k_groups,
                              col = list(k_means_group = c("0" = "blue", "1" = "red")))

# Add second col annotation, unlogged pathogen abundance score bar plot
rpm_sums_to_use <- rpm_sums[ordered_k_groups_name]
col_anno <- HeatmapAnnotation(
  `Unlogged Pathogen\nAbundance Score` = anno_barplot(10^rpm_sums_to_use, gp = gpar(fill = "gray")),
  `k-means cluster` = ordered_k_groups, 
  col = list(`k-means cluster` = c("0" = "blue", "1" = "red")),
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
  annotation_name_rot = 0
)

# Side annotation

# Do this for binary now to help for the row annotation
binary.up <- rep("1", length(sig.genes.up))
binary.down <- rep("0", length(sig.genes.down))
binary.combined <- c(binary.up, binary.down)
names(binary.combined) <- sig.genes.heatmap

row_anno <- rowAnnotation(
  `Gene Expression` = binary.combined, 
  col = list(`Gene Expression` = c("0" = "blue", "1" = "red")),
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
  annotation_name_rot = 0, 
  annotation_name_offset = unit(2, "mm")
)

# Generate heatmap without x-axis, y-axis labels, and annotation labels
gene.heatmap <- Heatmap(
  temp[ , ordered_k_groups_name],
  show_column_names = FALSE,  
  show_row_names = FALSE, 
  column_order = ordered_k_groups_name,
  top_annotation = col_anno,
  right_annotation = row_anno, 
  heatmap_legend_param = list(
    title = "Z-score\ngene expression",
    labels_gp = gpar(fontsize = 10), 
    title_gp = gpar(fontsize = 10, fontface = "bold"),  
    position = "right", 
    legend_gp = gpar(fontsize = 28)  
))

gene.heatmap

  1. Violin plot comparing pathogen abundance scores across both k-means clusters
# Create a data frame for plotting
plot_data <- data.frame(
  k_means_group = factor(ordered_k_groups, levels = unique(ordered_k_groups)), 
  rpm_sums = rpm_sums_to_use
)

# Assign colors based on k-means group position (left = red, right = blue)
plot_data$point_color <- ifelse(as.numeric(plot_data$k_means_group) <= length(unique(ordered_k_groups)) / 2, "red", "blue")
plot_data$k_means_group <- factor(plot_data$k_means_group, levels = c("1", "0"), labels = c("Group 1", "Group 2"))

# Perform a stat test to confirm what p val is for both samples: 
shapiro_results <- by(plot_data$rpm_sums, plot_data$k_means_group, shapiro.test)
print(shapiro_results)
## plot_data$k_means_group: Group 1
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.51495, p-value < 2.2e-16
## 
## ------------------------------------------------------------ 
## plot_data$k_means_group: Group 2
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.92623, p-value = 1.294e-05
# Since not normal, follow wilcox test:
wilcox_result <- wilcox.test(rpm_sums ~ k_means_group, data = plot_data)

print(wilcox_result$p.value)
## [1] 1.492372e-29
# Effect size calculation using Cliff's Delta
cliffs_delta <- cliff.delta(plot_data$rpm_sums[plot_data$k_means_group == "Group 1"], 
                            plot_data$rpm_sums[plot_data$k_means_group == "Group 2"])

print(cliffs_delta)
## 
## Cliff's Delta
## 
## delta estimate: 0.8935315 (large)
## 95 percent confidence interval:
##     lower     upper 
## 0.8091849 0.9417936
violin <- ggplot(plot_data, aes(x = k_means_group, y = 10^rpm_sums, fill = k_means_group)) +
  geom_violin(alpha = 0.5, scale = "width") +  # Violin plot
  geom_boxplot(width = 0.2, outlier.shape = NA, color = "black", alpha = 0.7) + 
  geom_jitter(aes(color = point_color), width = 0.2, size = 2, alpha = 0.7) + 
  scale_fill_manual(values = c("red", "blue")) +  
  scale_color_identity() + 
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  ) +
  labs(x = "K-means Group", y = "Pathogen Abundance (RPM)", title = "Violin Plot of RPM Sums by K-means Group")
violin

  1. ROC and AUC: using pathogen abundance scores to predict k-means clusters
par(pty = "s")
cluster <- ifelse(kmeans_result_pca$cluster == 2, 0, 1)
regression_1 <- glm(cluster ~ rpm_sums, data = data.frame(cluster = cluster, rpm_sums), family = binomial)  
regression_1_probs <- predict(regression_1, type = "response")
auc_regression_1 <- roc(cluster, regression_1_probs)
print(auc_regression_1)
## 
## Call:
## roc.default(response = cluster, predictor = regression_1_probs)
## 
## Data: regression_1_probs in 110 controls (cluster 0) < 104 cases (cluster 1).
## Area under the curve: 0.9468
plot(auc_regression_1, col = "blue", main = "Pathogen Abundance Score Predicts RNA-Seq Host Response", asp = 1, lwd = 3)

  1. Immune gene expression levels over pathogen abundance classes
generate.scatter <- function(gene, ensemblID, tpm, rpm_sums, threshold, save = FALSE) {
  
  # Get TPM values for selected gene
  score <- tpm[ensemblID, ]
  
  # Create data frame
  scatter.df <- data.frame(rpm_sums, score)
  cor_test <- cor.test(scatter.df$rpm_sums, scatter.df$score)
  
  # Create scatter plot
  scatter <- ggplot(scatter.df, aes(x = rpm_sums, y = score)) + 
    geom_smooth(method = "lm", se = FALSE, color = "red") + 
    geom_point(size = 3) +
    labs(
      title = paste(gene, "Expression vs. UTI Pathogen Abundance"),
      x = "log10(Pathogen RPM Score)",
      y = paste(gene, "TPM")
    ) +
    theme_minimal() + 
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"), # Center and bold title
      text = element_text(size = 14), # Increase overall text size
      panel.border = element_rect(color = "black", fill = NA, size = 1.5) # Add border
    )
  
  # Create a grouping variable for threshold
  scatter.df$group <- ifelse(scatter.df$rpm_sums >= threshold, "Positive", "Negative")
  
  # Create the jitter + boxplot with t-test annotation
  jitter_boxplot <- ggplot(scatter.df, aes(x = group, y = score, color = group, fill = group)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.4, color = "black") +  # Boxplot without outliers
    geom_jitter(width = 0.2, size = 3, alpha = 0.7) +
    scale_color_manual(values = c("Negative" = "blue", "Positive" = "red")) +
    scale_fill_manual(values = c("Negative" = "blue", "Positive" = "red")) +
    labs(
      title = paste(gene, "Expression by Pathogen Abundance Group"),
      x = "Pathogen Group",
      y = paste(gene, "TPM")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      text = element_text(size = 14), 
      panel.border = element_rect(color = "black", fill = NA, size = 1.5)
    ) +
    stat_compare_means(method = "t.test", label = "p.format", 
                       label.y = max(scatter.df$score) * 1.05)
  
  # Save plots if needed
  if (save) {
    output_file = paste0(gene, ".pdf")
    pdf(output_file, width = 8, height = 6)
    print(scatter)
    print(jitter_boxplot)
    dev.off()
  }
  
  # Return plots as a list
  return(list(scatter = scatter, jitter = jitter_boxplot))
}

# Use sample genes
NFKB2 <- generate.scatter("NFKB2", "ENSG00000077150", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
JAK3 <- generate.scatter("JAK3", "ENSG00000105639", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
LIMK2 <- generate.scatter("LIMK2", "ENSG00000182541", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
CSF3R <- generate.scatter("CSF3R", "ENSG00000119535", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
IL8 <- generate.scatter("IL8", "ENSG00000169429", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
TNF <- generate.scatter("TNF", "ENSG00000232810", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
HLAA <- generate.scatter("HLAA", "ENSG00000206503", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)

# Print them
print(NFKB2$jitter)

print(JAK3$jitter)

print(LIMK2$jitter)

print(CSF3R$jitter)

print(IL8$jitter)

print(TNF$jitter)

print(HLAA$jitter)

4. How well does RNA-seq agree with clinical UTI diagnostics?

  1. Comparing with clinical: venn diagrams proportional to sizes of groups
# First, create the df containing all four binary classifications (venn.df)

# 1 -> RNA-seq pathogen abundance
rpm_sums_binary <- ifelse(rpm_sums > threshold, 1, 0)

# 2 -> Clinical host response (leukocyte esterase)
enrleuks <- reads.meta.counts$enrleuks
binary.enrleuks <- ifelse(enrleuks == 1 | is.na(enrleuks), 0, 1)

# 3 -> RNA-seq host response (k-means clusters, already computed above in 'cluster' object)

# 4 -> Clinical pathogen abundance
clin <- reads.meta.counts$uti

venn.df <- data.frame(patients.over.thres, 
                      cluster, 
                      rpm_sums_binary, 
                      binary.enrleuks, 
                      clin)

# remove patient 20070, because they do not have leukocyte esterase activity data (n = 213 now)
venn.df <- venn.df %>% filter(patients.over.thres != 20070)

# This will only be used for the ROC analysis predicting leukocyte esterase
rpm_sums_temp <- rpm_sums[names(rpm_sums) != 20070]

# Add an all-zero row to count patients with no group memberships
venn.df <- venn.df[ , c(2, 3, 4, 5)]

colnames(venn.df) <- c("RNA-Seq Host Response Cluster", "RNA-Seq Pathogen Abundance", 
                       "Clinical Host Response Score", "Clinical Pathogen Abundance")

# Generate Euler diagram
new.venn <- euler(venn.df)

# Plot four overlapping categories
plot(new.venn, 
     quantities = TRUE, 
     fills = c("cornflowerblue", "lightcoral", "goldenrod1", "mediumseagreen"),
     alpha = 0.6, 
     labels = list(font = 2, cex = 1.2), 
     edges = list(col = "black", lwd = 2),
     legend = TRUE)

rnaseq_venn <- euler(venn.df[ , c(1,2)])


par(mar = c(5, 5, 5, 5)) 

# Plot RNA-seq host response versus pathogen abundance
plot(rnaseq_venn, 
     quantities = list(cex = 1.5),
     fills = c("red", "blue"), 
     alpha = 0.6,    
     labels = FALSE,
     edges = list(col = "black", lwd = 2),  
     legend = TRUE, 
     quantities.cex = 10)  # Increase text size

# Plot clinical host response versus pathogen abundance
clinical_venn <- euler(venn.df[ , c(3,4)])
plot(clinical_venn, 
     quantities = list(cex = 1.5),
     fills = c("green", "purple"), 
     alpha = 0.6,  
     edges = list(col = "black", lwd = 2), 
     legend = TRUE,
     quantities.cex = 10) 

  1. Comparing with clinical: same venn diagrams (not proportional)
# Create a list of sets for all positive diagnoses across groups
sets <- list(
  "RNA-Seq Host\nResponse Score" = which(venn.df[ , 1] == 1),
  "RNA-Seq\nTaxonomic Score" = which(venn.df[ , 2] == 1),
  "Clinical Host Response\n(Leukocyte Esterase)" = which(venn.df[ , 3] == 1),
  "Clinical Taxonomic\nScore (Culture)" = which(venn.df[ , 4] == 1)
)

# Plot the Venn diagram
venn_plot <- ggvenn(sets, 
       fill_color = c("red", "blue", "green", "purple"), 
       text_size = 5, 
       set_name_size = 5)

# Should add a count for all negative patients
all_zero_count <- sum(rowSums(venn.df) == 0)

# Add annotation to the plot
venn_plot +
  annotate(
    "text", 
    x = 0.5, y = -2,
    label = paste("All zero count:", all_zero_count),
    size = 5,
    hjust = 0.5
  )

# 1) -> Just RNA-Seq

# Create a list of sets
sets1 <- list(
  "RNA-Seq HR" = which(venn.df[ , 1] == 1),
  "RNA-Seq Tax" = which(venn.df[ , 2] == 1))

# Plot the Venn diagram
venn_plot1 <- ggvenn(sets1, 
                    fill_color = c("red", "blue"), 
                    text_size = 4, 
                    set_name_size = 4)

venn_plot1

# 2) -> Just Clinical

# Create a list of sets
sets2 <- list(
  "Clinical HR" = which(venn.df[ , 3] == 1),
  "Clinical Tax" = which(venn.df[ , 4] == 1))

# Plot the Venn diagram
venn_plot2 <- ggvenn(sets2, 
                     fill_color = c("green", "purple"), 
                     text_size = 4, 
                     set_name_size = 4)

venn_plot2

  1. ROC and AUC: using pathogen abundance scores to predict clinical labels
# Predicting clinical host response
par(pty = "s")
regression_2 <- glm(venn.df$`Clinical Host Response Score` ~ rpm_sums_temp, data = data.frame(`Clinical Host Response Score` = venn.df$`Clinical Host Response Score`, rpm_sums_temp), family = binomial)
regression_2_probs <- predict(regression_2, type = "response")
auc_regression_2 <- roc(venn.df$`Clinical Host Response Score`, regression_2_probs)
print(auc_regression_2)
## 
## Call:
## roc.default(response = venn.df$`Clinical Host Response Score`,     predictor = regression_2_probs)
## 
## Data: regression_2_probs in 99 controls (venn.df$`Clinical Host Response Score` 0) < 114 cases (venn.df$`Clinical Host Response Score` 1).
## Area under the curve: 0.9527
plot(auc_regression_2, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Host Response", lwd = 3)

# Clinical taxonomy
regression_3 <- glm(venn.df$`Clinical Pathogen Abundance` ~ rpm_sums_temp, data = data.frame(`Clinical Pathogen Abundance` = venn.df$`Clinical Pathogen Abundance`, rpm_sums_temp), family = binomial)
regression_3_probs <- predict(regression_3, type = "response")
auc_regression_3 <- roc(venn.df$`Clinical Pathogen Abundance`, regression_3_probs)
print(auc_regression_3)
## 
## Call:
## roc.default(response = venn.df$`Clinical Pathogen Abundance`,     predictor = regression_3_probs)
## 
## Data: regression_3_probs in 96 controls (venn.df$`Clinical Pathogen Abundance` 0) < 117 cases (venn.df$`Clinical Pathogen Abundance` 1).
## Area under the curve: 0.9878
plot(auc_regression_3, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Pathogen", lwd = 3)

  1. Exploring the patients that do not agree
pred.df <- venn.df

provide.grouping <- function(MT_HR, MT_Tax, Clin_HR, Clin_Tax) {
  comb <- paste0(MT_HR, MT_Tax, Clin_HR, Clin_Tax)
  
  # Define meaningful labels for each combination
  group_labels <- list(
    "1111" = "All_Positive",
    "0000" = "All_Negative",
    "1110" = "Clin_Tax_Negative",
    "1101" = "Clin_HR_Negative",
    "1011" = "MT_Tax_Negative",
    "0111" = "MT_HR_Negative",
    "1100" = "Clin_Tax_HR_Negative",
    "1010" = "MT_Tax_Clin_Tax_Negative",
    "1001" = "MT_Tax_Clin_HR_Negative",
    "0110" = "MT_HR_Clin_Tax_Negative",
    "0101" = "MT_HR_Clin_HR_Negative",
    "0011" = "MT_HR_MT_Tax_Negative",
    "1000" = "Only_MT_HR_Positive",
    "0100" = "Only_MT_Tax_Positive",
    "0010" = "Only_Clin_HR_Positive",
    "0001" = "Only_Clin_Tax_Positive"
  )
  
  # Return the appropriate label or NA if not found
  return(group_labels[[comb]])
}

# Apply function to all rows, giving a string value for each
pred.df$group <- mapply(provide.grouping, 
                        pred.df[ , 1],
                        pred.df[ , 2],
                        pred.df[ , 3], 
                        pred.df[ , 4])

# Working with df_renormalized, FYI
incorrect_patients <- rownames(pred.df[pred.df$group != "All_Negative" &
                                       pred.df$group != "All_Positive", ])
  
df_renormalized_subset <- df_renormalized[incorrect_patients , ]

# Function that will generate a urobiome plot on this new df
generate.urobiome.plot.with.matrix <- function(rpm, patients, rpm.threshold, metadata) {
  # These are required
  require(ggplot2)
  require(reshape2)
  require(cowplot)
  require(dplyr)
  require(RColorBrewer)
  
  # 1. Process microbiome data
  rpm.subsetted <- rpm[patients, ]
  
  sum.above.thres <- function(patient) {
    current.row <- rpm.subsetted[patient, ]
    patients.above <- current.row[current.row >= rpm.threshold]
    sum.rpm.above <- sum(patients.above)
    sum.rpm.below <- 1e6 - sum.rpm.above
    patients.above <- c(patients.above, "other" = sum.rpm.below)
    
    if (sum(patients.above) != 1e6) stop("RPM must sum to 1 million!")
    patients.above <- patients.above[!names(patients.above) %in% "Homo sapiens"]
    return(patients.above / sum(patients.above))
  }
  
  stacked_bar_data <- lapply(patients, sum.above.thres)
  names(stacked_bar_data) <- patients
  
  # 2. Create plotting data
  plot_data <- bind_rows(lapply(seq_along(stacked_bar_data), function(i) {
    data.frame(t(stacked_bar_data[[i]]), Patient = patients[i], stringsAsFactors = FALSE)
  })) %>% 
    reshape2::melt(id.vars = "Patient", variable.name = "Taxa", value.name = "Abundance") %>% 
    mutate(Patient = factor(Patient, levels = patients))
  
  # 3. Create color palette
  taxa_list <- setdiff(unique(plot_data$Taxa), "other")
  other_colors <- colorRampPalette(brewer.pal(12, "Set3"))(length(taxa_list))
  fill_colours <- c("other" = "grey", setNames(other_colors, taxa_list))
  
  # 4. Create main plot with its own legend
  p_main <- ggplot(plot_data, aes(x = Patient, y = Abundance, fill = Taxa)) +
    geom_bar(stat = "identity", width = 0.7) +
    coord_flip() +
    scale_fill_manual(values = fill_colours) +
    labs(x = "Patient", y = "Proportion of Non-Human Reads") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))
  
  # 5. Create test matrix with colored symbols
  test_long_names <- c(
    "MT_HR" = "RNA-seq\nHost\nResponse",
    "MT_Tax" = "RNA-seq\nPathogen\nAbundance",
    "Clin_HR" = "Clinical\nHost\nResponse",
    "Clin_Tax" = "Clinical\nPathogen\nAbundance"
  )
  
  test_results <- data.frame(
    Patient = factor(patients, levels = patients),
    MT_HR = pred.df[patients, "RNA-Seq Host Response Cluster"],
    MT_Tax = pred.df[patients, "RNA-Seq Pathogen Abundance"],
    Clin_HR = pred.df[patients, "Clinical Host Response Score"],
    Clin_Tax = pred.df[patients, "Clinical Pathogen Abundance"],
    stringsAsFactors = FALSE
  ) %>% 
    reshape2::melt(id.vars = "Patient", variable.name = "Test", value.name = "Status")
  
  p_matrix <- ggplot(test_results, aes(x = Test, y = Patient)) +
    geom_text(aes(label = ifelse(Status == 1, "√", "X"), 
                  color = ifelse(Status == 1, "Positive", "Negative")),
              size = 4) +
    scale_x_discrete(position = "top", labels = test_long_names) +
    scale_color_manual(values = c("Positive" = "green4", "Negative" = "red")) +
    theme_minimal() +
    theme(
      axis.title = element_blank(),
      axis.text.y = element_blank(),
      panel.grid = element_blank(),
      axis.text.x = element_text(angle = 0, hjust = 0.5, size = 9),
      legend.position = "right",
      legend.title = element_blank()
    )
  
  # 6. Remove legends from both plots
  p_main <- p_main + theme(legend.position = "none")
  p_matrix <- p_matrix + theme(legend.position = "none")
  
  # 7. Combine plots without legends
  aligned_plots <- align_plots(
    p_main,
    p_matrix,
    align = "hv",
    axis = "lr"
  )
  
plot_grid(
    aligned_plots[[1]],  # Main plot
    aligned_plots[[2]],  # Matrix
    nrow = 1,
    rel_widths = c(3, 3),  # Increase width of matrix plot
    align = "h",
    axis = "tb"
)
}

# Generate main plot without legend
main_plot <- generate.urobiome.plot.with.matrix(
  df_renormalized_subset, 
  incorrect_patients,
  75000,
  reads.meta.counts
)

print(main_plot)

## STILL NEED TO ADD THE LEGEND!!
  1. Colour host-response PCA with clinically-determined culture-based results
# First ensure we keep all samples, including those with NA for mainpathogen
pca_df <- data.frame(
  PC1 = pca_data$PC1,
  PC2 = pca_data$PC2,
  Sample = rownames(pca_data)
  )
pca_df <- merge(pca_df, 
                reads.meta.counts[patients.over.thres, c("ptid", "mainpathogen")], 
                by.x = "Sample", 
                by.y = "ptid",
                all.x = TRUE)

# Create a new column that handles NA values
pca_df$pathogen_group <- ifelse(is.na(pca_df$mainpathogen), 
                                "NA", 
                                as.character(pca_df$mainpathogen))

# Define custom color palette for pathogens (7 colors + grey for NA)
pathogen_colors <- c(
  "E. coli" = "#1F77B4",     # Blue
  "E. coli / Proteus" = "#FF7F0E", # Orange
  "Enterococcus" = "#2CA02C", # Green
  "Klebsiella" = "#D62728", # Red
  "Proteus" = "#9467BD", # Purple
  "Pseudomonas Aeruginosa" = "#8C564B",    # Brown
  "NA" = "#a3a3a3"                     # Grey for missing
)

# Create the plot
pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2)) +
  # Add cluster ellipses first (so points appear on top)
  stat_ellipse(
    aes(group = cluster, fill = factor(cluster)),
    geom = "polygon",
    alpha = 0.1,
    color = NA,
    show.legend = TRUE, 
    level = 0.95
  ) +
  
  # Add points colored by pathogen
  geom_point(
    aes(color = pathogen_group),
    size = 3,
    alpha = 0.8
  ) +
  
  # Custom color scales
  scale_color_manual(
    values = pathogen_colors,
    na.value = "grey70",
    breaks = names(pathogen_colors)[1:7]
  ) +
  
  scale_fill_manual(
    values = c("Cluster1" = "blue", "Cluster2" = "red"),
    guide = guide_legend(override.aes = list(alpha = 0.3))
  ) +
  
  # Labels and theme
  labs(
    title = "PCA by Pathogen",
    x = "PC1",
    y = "PC2",
    color = "Main Pathogen",
    fill = "Cluster"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    legend.position = "right",
    legend.title = element_text(face = "bold", size = 14),
    legend.text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

pca_plot

### Now create the same plot with 3D ###

# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df_3d$mainpathogen <- ifelse(is.na(pca_df$mainpathogen), 
                                "NA", 
                                as.character(pca_df$mainpathogen))


# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
  data = pca_df_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~mainpathogen,
  colors = pathogen_colors,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with clinical main pathogen", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
k_means_plot_3d
  1. Colour host-response PCA with labels from venn diagram
set.seed(1234)

pca_df_4f <- pca_df
pca_df_4f <- pca_df_4f[pca_df_4f$Sample %in% rownames(pred.df) , ]
if (all(pca_df_4f$Sample == rownames(pred.df))) {
  pca_df_4f$group <- pred.df$group
}

new_cluster <- cluster[-(which(pca_df$Sample == "20070"))]
pca_df_4f$cluster <- kmeans_result_pca$new_cluster

# Create the plot
pca_plot_2 <- ggplot(pca_df_4f, aes(x = PC1, y = PC2)) +
  # Add cluster ellipses first (so points appear on top)
stat_ellipse(
  aes(group = new_cluster, fill = factor(new_cluster)),
  geom = "polygon",
  alpha = 0.3,
  color = "black",    # outline to make visible
  level = 0.95,
  show.legend = TRUE
) +
  
  # Add points colored by pathogen
  geom_point(
    aes(color = group),
    size = 3,
    alpha = 0.8
  ) +
  
  scale_fill_manual(
    values = c("Cluster1" = "blue", "Cluster2" = "red"),
    guide = guide_legend(override.aes = list(alpha = 0.3))
  ) +
  
  # Labels and theme
  labs(
    title = "PCA by venn group",
    x = "PC1",
    y = "PC2",
    color = "Group",
    fill = "Cluster"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    legend.position = "right",
    legend.title = element_text(face = "bold", size = 14),
    legend.text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

pca_plot_2

### Now create the same plot with 3D ###

pca_df_4f_3d <- pca_df_3d
pca_df_4f_3d <- pca_df_4f_3d[-(which(rownames(pca_df_4f_3d) == "20070")) , ]
pca_df_4f_3d$group <- pca_df_4f$group

# 3D PCA plot with k-means clusters
pca_plot_4f_3d <- plot_ly(
  data = pca_df_4f_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~group,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with venn diagram group colours", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
pca_plot_4f_3d
  1. Take these 8 MT_HR_Negative patients, and compare leukocyte esterase with the all positive group
pca_df_4f$LE_discrete <- reads.meta.counts[-(which(pca_df$Sample == "20070")) , ]$enrleuks

pca_df_4f_filtered <- pca_df_4f[pca_df_4f$group %in% c("All_Positive", "MT_HR_Negative"), ]
ggplot(pca_df_4f_filtered, aes(x = factor(group), y = LE_discrete, fill = factor(group))) +
  geom_violin(trim = FALSE, show.legend = FALSE) +  # Violin plot with no trim and without legend
  geom_boxplot(width = 0.1, color = "black", fill = "white", alpha = 0.6, outlier.shape = NA) +  # Boxplot inside violins
  labs(x = "Discrepancy Group", y = "Absolute Leukocyte Esterase Activity", title = "Distribution of Factor Across All_Positive and MT_HR_Negative") +
  scale_fill_manual(values = c("All_Positive" = "#1f77b4", "MT_HR_Negative" = "#ff7f0e")) +  # Custom colors for the groups
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.border = element_rect(color = "black", fill = NA, size = 1))  # Add a border around the plot

wilcox.test(LE_discrete ~ group, data = pca_df_4f_filtered)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  LE_discrete by group
## W = 668, p-value = 0.0003453
## alternative hypothesis: true location shift is not equal to 0
  1. Cell type enrichment with xCell, mapping on all four prediction types
xCell_txi <- txi.data$abundance
rownames(xCell_txi) <- res.df.human$symbol


xcell_results <- xCellAnalysis(xCell_txi)
## [1] "Num. of genes: 10362"
## No annotation package name available in the input data object.
## Attempting to directly match identifiers in data to gene sets.
## Estimating ssGSEA scores for 489 gene sets.
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
##   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |=========                                                             |  14%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                                                                              |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                                       |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                              |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  57%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |====================================================                  |  74%  |                                                                              |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |=========================================================             |  81%  |                                                                              |=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================| 100%
xcell_results_scaled <- xcell_results[ , colnames(xcell_results) != "20070"]
xcell_results_scaled <- t(scale(t(xcell_results_scaled)))

# Ensure the order matches between pred.df and xcell_results_scaled
binary_annots <- pred.df[, 1:4]
binary_annots <- binary_annots[colnames(xcell_results_scaled), ]  # reordering to match

# Step 1: Get ordering based on the annotation column
ordering <- order(pred.df$`RNA-Seq Host Response Cluster`, decreasing = TRUE)

# Step 2: Reorder both the data and annotations
xcell_results_scaled_ordered <- xcell_results_scaled[, ordering]
binary_annots_ordered <- pred.df[ordering, 1:4]

# Step 3: Rebuild the annotation with reordered rows
ha <- HeatmapAnnotation(
  df = binary_annots_ordered,
  col = list(
    `RNA-Seq Host Response Cluster` = c("0" = "grey", "1" = "red"),
    `RNA-Seq Pathogen Abundance` = c("0" = "grey", "1" = "blue"),
    `Clinical Host Response Score` = c("0" = "grey", "1" = "darkgreen"),
    `Clinical Pathogen Abundance` = c("0" = "grey", "1" = "purple")
  ),
  show_annotation_name = TRUE,
  annotation_name_side = "left"
)

# Step 4: Redraw the heatmap with sorted columns
xcell_heatmap <- Heatmap(
  xcell_results_scaled_ordered,
  top_annotation = ha,
  col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
  cluster_columns = FALSE,
  column_names_gp = gpar(fontsize = 0),   # hide patient IDs
  row_names_gp = gpar(fontsize = 8),      # smaller y-axis labels
  name = "xCell score"                    # rename legend
)


# Draw the heatmap
print(xcell_heatmap)